import io
import json
import re
from Queue import Queue
from threading import Thread,Lock

import requests
from lxml import etree

''' Crawl the text pages of Qiushibaike and store the results in a text file (they could also be stored in a database). '''

def create_url():
    """Build the queue of page URLs that the crawler should fetch.

    Returns:
        Queue: one URL per listing page, for pages 1 through 13, queued in
        ascending page order.
    """
    site_prefix = 'https://www.qiushibaike.com/text/page/'
    pending = Queue()

    # Each page is addressed as "<prefix><page number>/".
    suffixes = ('%s/' % page_no for page_no in range(1, 14))
    for suffix in suffixes:
        pending.put(site_prefix + suffix)

    return pending



def crawl_data(url, html_queue):
    """Download one page and put its HTML text on ``html_queue``.

    Args:
        url: Address of the page to fetch.
        html_queue: Queue that receives the decoded response body.

    Raises:
        requests.exceptions.RequestException: if the request fails or the
            server does not answer within the timeout.
    """
    headers = {'User-Agent':'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'}

    # Without a timeout a stalled server would block this worker forever.
    response = requests.get(url, headers=headers, timeout=10)

    # Queue.put() is already thread-safe, so no extra lock is needed here
    # (a Lock created inside the function would be private to this call and
    # could not exclude any other thread anyway).
    html_queue.put(response.text)


def _first_or_none(matches):
    """Return the first XPath match, or None when nothing matched."""
    return matches[0] if matches else None


def analy_data(html, analy_queue):
    """Extract the posts from one page of HTML and queue them as one list.

    Every ``div`` whose id contains ``qiushi_tag`` is treated as a post and
    turned into a dict with the keys ``text``, ``good`` and ``discuss``.
    A field whose element is missing from the post is stored as ``None``
    instead of aborting the whole page with an IndexError.

    Args:
        html: HTML source of a listing page.
        analy_queue: Queue that receives the list of post dicts for this page.
    """
    document = etree.HTML(html)
    posts = document.xpath("//div[contains(@id, 'qiushi_tag')]")

    data = []
    for post in posts:
        body = _first_or_none(post.xpath(".//div[@class='content']/span"))
        data.append({
            # lxml's ``.text`` is the text that precedes the span's first
            # child element, not the concatenation of all nested text.
            # Compare with None explicitly: an lxml element without children
            # is falsy.
            "text" : body.text if body is not None else None,
            "good" : _first_or_none(
                post.xpath(".//span[@class='stats-vote']/i/text()")),
            "discuss" : _first_or_none(
                post.xpath(".//span[@class='stats-comments']/a/i/text()")),
        })

    # Queue.put() is thread-safe on its own; a per-call Lock guards nothing.
    analy_queue.put(data)



# Shared by every deal_data() call so that concurrent writers really are
# serialized; a Lock created inside the function would be private to each
# call and exclude nobody.
_WRITE_LOCK = Lock()


def deal_data(contents):
    """Append each record in ``contents`` to ``xiushi.txt`` as one JSON line.

    Records are serialized with ``ensure_ascii=False`` and the file is
    written as UTF-8. All records are serialized before the file is opened,
    and the file is opened once per call. Nothing is written (and the file
    is not created) when ``contents`` is empty.

    Args:
        contents: Iterable of JSON-serializable records.
    """
    lines = []
    for content in contents:
        text = json.dumps(content, ensure_ascii=False)
        if isinstance(text, bytes):
            # Python 2 may return a byte string when the record holds no
            # unicode values; the text-mode file below needs unicode.
            text = text.decode('utf-8')
        lines.append(text + u'\n')

    if not lines:
        return

    with _WRITE_LOCK:
        # io.open takes an explicit encoding on both Python 2 and Python 3.
        with io.open('xiushi.txt', 'a', encoding='utf-8') as f:
            f.writelines(lines)

def _run_stage(target, in_queue, *extra_args):
    """Run ``target(item, *extra_args)`` in one thread per queued item and wait for all of them."""
    workers = []
    # Only this (the calling) thread consumes in_queue, and its producers
    # have already finished, so empty()/get() cannot race here.
    while not in_queue.empty():
        worker = Thread(target=target, args=(in_queue.get(),) + extra_args)
        worker.start()
        workers.append(worker)
    # Join only after every worker has been started; joining right after
    # start() would run the items one at a time.
    for worker in workers:
        worker.join()


def main():
    """Run the crawl pipeline: download pages, parse them, store the posts.

    This is a small scheduler built around a producer/consumer model: each
    stage drains the queue filled by the previous one. The download and
    parse stages run their items concurrently, so the order in which pages
    reach the output file is not guaranteed.
    """
    url_queue = create_url()
    html_queue = Queue()
    analy_queue = Queue()

    _run_stage(crawl_data, url_queue, html_queue)
    _run_stage(analy_data, html_queue, analy_queue)

    # All batches go to the same file, so they are written one at a time.
    # A thread is still used per batch so that an error in one batch is
    # reported without stopping the remaining batches.
    while not analy_queue.empty():
        writer = Thread(target=deal_data, args=(analy_queue.get(),))
        writer.start()
        writer.join()


# Start the crawl only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
